1 Source libraries, functions and plotting parameters

library(here)

source(here::here("code/scripts/20210628_source.R"))

2 Get full list of traits, associations and studies from the GWAS Catalog

2.1 Traits

date_of_collection = "20210628"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
out_file = here::here("code/snakemake/20210625/config",
                      paste(date_of_collection, "_all_traits.tsv", sep = ""))

# Get list of all traits in database
all_traits = gwasrapidd::get_traits()

# Filter out IDs that cause an error
all_traits_tbl = all_traits@traits %>% 
  dplyr::filter(efo_id != "CHEBI:7916") %>%
  # Save to file
  readr::write_tsv(out_file)

2.2 Associations

#Need to do them sequentially because the GWAS Catalog can't handle dozens (let alone thousands) of simultaneous queries.
date_of_collection = "20210628"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"


counter = 0
lapply(all_traits@traits$efo_id, function(EFO_ID) {
  counter <<- counter + 1
  # set output file name
  out_path = file.path(long_term_dir,
                       date_of_collection,
                       "associations_raw",
                       paste(EFO_ID, ".rds", sep = ""))
  # if output file doesn't already exist, get associations and save
  if (!file.exists(out_path)){
    out = gwasrapidd::get_associations(efo_id = EFO_ID)
    saveRDS(out, out_path)
  } else {
    print(paste(counter, ". ", EFO_ID, ": File already exists.", sep = ""))
  }
  # save
})

2.3 Read in associations and filter for those with more than 50 unique SNPs

date_of_collection = "20210628"
long_term_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd"
out_file = here::here("code/snakemake/20210625/config",
                      paste(date_of_collection, "_filtered_traits.tsv", sep = ""))

all_assocs = lapply(all_traits_tbl$efo_id, function(EFO_ID){
  in_path = file.path(long_term_dir,
                      date_of_collection,
                      "associations_raw",
                      paste(EFO_ID, ".rds", sep = ""))
  readRDS(in_path)
})
names(all_assocs) = all_traits_tbl$efo_id

# filter for those with more than 50 unique SNPs
filt_assocs = all_assocs %>% 
  purrr::keep(function(EFO_ID){
    length(unique(EFO_ID@risk_alleles$variant_id)) >= 50
  })

# Filter `all_traits_tbl` for those traits and save to file
filt_traits_tbl = all_traits_tbl %>% 
  dplyr::filter(efo_id %in% names(filt_assocs)) %>% 
  readr::write_tsv(out_file)

3 Run Snakemake pipeline to extract data

Full Snakemake pipeline here: https://github.com/brettellebi/human_traits_fst/tree/master/code/snakemake/20210625

4 Read in processed data

target_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd/20210628"

# Fst
fst_all_path = file.path(target_dir, "pegas/fst/consol/all.rds")
fst_all = readRDS(fst_all_path) %>%
  lapply(., function(TRAIT_ID){
    # convert to DF
    TRAIT_ID %>%
        data.frame(.) %>% 
        tibble::rownames_to_column(var = "SNP") %>% 
        dplyr::select(SNP, FST_PEGAS = Fst)
  })

# Bind into df
fst_all_df = fst_all %>% 
  dplyr::bind_rows(.id = "EFO_ID") %>% 
  dplyr::left_join(all_traits_tbl %>% 
                     dplyr::select(EFO_ID = efo_id,
                                   TRAIT = trait),
                   by = "EFO_ID")

# Clumped SNPs
## Note that for some traits, no SNPs met the p-value threshold, so Plink1.9 did not produce a `*.clumped` file
## There are therefore fewer files than there are traits
clumped_files = list.files(file.path(target_dir, "plink/clumped"), pattern = "*.clumped", full.names = T)
clumped = lapply(clumped_files, function(FILE){
  readr::read_delim(FILE,
                    delim = " ",
                    col_types = "i-c-d-------",
                    trim_ws = T)
})
names(clumped) = stringr::str_remove(basename(clumped_files), ".clumped")


# Filter `fst_all` for clumped SNPs
fst_clumped = lapply(names(fst_all), function(TRAIT_ID){
    fst_all[[TRAIT_ID]] %>% 
      dplyr::filter(SNP %in% clumped[[TRAIT_ID]]$SNP)
})
names(fst_clumped) = names(fst_all)
# Bind into DF
fst_clumped_df = fst_clumped %>% 
  dplyr::bind_rows(.id = "EFO_ID") %>% 
  dplyr::left_join(all_traits_tbl %>% 
                     dplyr::select(EFO_ID = efo_id,
                                   TRAIT = trait),
                   by = "EFO_ID")

4.1 Add palette based on pre-clump SNP count

pal_all = turbo(n = length(unique(fst_all_df$EFO_ID)))
names(pal_all) = fst_all_df %>% 
  dplyr::count(TRAIT) %>% 
  dplyr::arrange(desc(n)) %>% 
  dplyr::pull(TRAIT)

5 SNP counts pre- and post-clumping

snp_count_plot = list("PRE-CLUMP" = fst_all_df,
                      "POST-CLUMP" = fst_clumped_df) %>% 
  dplyr::bind_rows(.id = "FILTER") %>% 
  # order variables
  dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_all)),
                FILTER = factor(FILTER, levels = c("PRE-CLUMP", "POST-CLUMP"))) %>% 
  ggplot() +
    geom_bar(aes(TRAIT, fill = TRAIT)) +
    scale_fill_manual(values = pal_all) +
    theme_bw() +
    theme(axis.text.x = element_blank(),
          axis.title.x = element_blank(),
          axis.ticks.x = element_blank()) +
    ylab("N LOCI") +
    facet_wrap(~FILTER, nrow = 2) +
    guides(fill = "none")

plotly::ggplotly(snp_count_plot,
                 tooltip = c("TRAIT", "y"))
width = 12
height = 10

ggsave(here::here("docs/plots/20210722_snp_counts.png"),
       plot = snp_count_plot,
       device = "png",
       width = width,
       height = height,
       units = "in",
       dpi= 400)

ggsave(here::here("docs/plots/20210722_snp_counts.svg"),
       plot = snp_count_plot,
       device = "svg",
       width = width,
       height = height,
       units = "in")

5.1 Table of post-clump traits

fst_clumped_df %>% 
  dplyr::count(EFO_ID, TRAIT) %>% 
  dplyr::rename(N_SNPS = n) %>% 
  dplyr::arrange(desc(N_SNPS)) %>% 
  DT::datatable(., options = list(pageLength = 10))

6 eCDF for \(F_{ST}\)

6.1 New palette for clumped traits

## Add palette based on post-clump SNP count
pal_clump = turbo(n = length(unique(fst_clumped_df$EFO_ID)))
names(pal_clump) = fst_clumped_df %>% 
  dplyr::count(EFO_ID) %>% 
  dplyr::arrange(desc(n)) %>% 
  dplyr::pull(EFO_ID)

# order `TRAIT` by `EFO_ID` 
od_clumped_traits = fst_clumped_df %>% 
  dplyr::select(EFO_ID, TRAIT) %>% 
  dplyr::distinct() %>% 
  dplyr::left_join(data.frame(EFO_ID = names(pal_clump)),
                   .,
                   by = "EFO_ID") %>% 
  dplyr::pull(TRAIT)

# make names correspond to trait
names(pal_clump) = od_clumped_traits

6.2 Plot all

ecdf_all_faceted = fst_clumped_df %>% 
  dplyr::group_by(EFO_ID) %>%
  # filter for those with >0 SNPs in the clumped dataset
  dplyr::filter(EFO_ID %in% unique(fst_clumped_df$EFO_ID)) %>% 
  dplyr::ungroup() %>% 
  # order
  dplyr::mutate(TRAIT = factor(TRAIT, levels = od_clumped_traits)) %>% 
  ggplot() +
    stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
    scale_colour_manual(values = pal_clump) +
    theme_bw() +
    ggtitle("eCDF") +
    facet_wrap(~TRAIT) +
    guides(colour = "none") +
    theme(strip.text.x = element_text(size = 4.5))


ggsave(here::here("docs/plots/20210701_ecdf_all_faceted.png"),
       plot = ecdf_all_faceted, 
       device = "png",
       width = 30,
       height = 22,
       units = "in",
       dpi = 400)
knitr::include_graphics(here::here("docs/plots/20210701_ecdf_all_faceted.png"))

6.3 Plotly

6.3.1 All

plotly_ecdf_all = fst_clumped_df %>% 
  # order FST_PEGAS so that it's properly plotted by plotly
  dplyr::arrange(EFO_ID, FST_PEGAS) %>% 
  dplyr::group_by(EFO_ID) %>%
  # filter for those with >0 SNPs in the clumped dataset
  dplyr::filter(EFO_ID %in% unique(fst_clumped_df$EFO_ID)) %>% 
  dplyr::ungroup() %>% 
  # order
  dplyr::mutate(TRAIT = factor(TRAIT, levels = od_clumped_traits)) %>% 
  ggplot() +
    stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
    scale_colour_manual(values = pal_clump) +
    theme_bw() +
    ggtitle("eCDF") +
    guides(colour = "none") +
    theme(strip.text.x = element_text(size = 5))

plotly::ggplotly(plotly_ecdf_all,
                 tooltip = c("TRAIT"))
## Warning: Removed 2 rows containing non-finite values (stat_ecdf).

6.3.2 Select

# get traits to look closer at
target_traits = c("body height",
                  "body mass index",
                  "mathematical ability",
                  "intelligence",
                  "self reported educational attainment",
                  "type ii diabetes mellitus",
                  "asthma",
                  "HIV-1 infection",
                  "viral load", 
                  "heart failure",
                  "diabetes mellitus",
                  "coronary artery disease",
                  "mortality",
                  "longevity",
                  "schizophrenia",
                  "skin pigmentation",
                  "skin pigmentation measurement",
                  "eye colour measurement",
                  "facial morphology measurement",
                  "melanoma",
                  "synophrys measurement",
                  "lip morphology measurement",
                  "squamous cell carcinoma",
                  "tuberculosis",
                  "endometrial carcinoma",
                  "BMI-adjusted hip circumference",
                  "alcohol dependence measurement",
                  "loneliness measurement"
                  )

plotly_ecdf_select = fst_clumped_df %>%
  # filter for `target_traits`
  dplyr::filter(TRAIT %in% target_traits) %>%
  # take unique SNPs
  dplyr::group_by(EFO_ID) %>%
  dplyr::distinct(SNP, .keep_all = T) %>% 
  dplyr::ungroup() %>% 
  # order FST_PEGAS so that it's properly plotted by plotly
  dplyr::arrange(EFO_ID, FST_PEGAS) %>% 
  # order
  dplyr::mutate(TRAIT = factor(TRAIT, levels = od_clumped_traits)) %>% 
  ggplot() +
    stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
    scale_colour_manual(values = pal_clump) +
    theme_bw() +
    ggtitle("eCDF") +
    guides(colour = "none") +
    theme(strip.text.x = element_text(size = 5))

plotly::ggplotly(plotly_ecdf_select,
                 tooltip = c("TRAIT"))
## Warning: Removed 1 rows containing non-finite values (stat_ecdf).

7 Extract same number of top SNPs for selected traits as skin pigmentation

To check whether the distribution changes if you only have a few SNPs with presumably the highest effect sizes.

# How many unique SNPs for each trait
fst_clumped_df %>% 
  dplyr::filter(TRAIT %in% target_traits) %>% 
  dplyr::group_by(TRAIT) %>% 
  dplyr::summarise(SNP_COUNT = n_distinct(SNP)) %>% 
  DT::datatable(.)
# Get highest number of SNPs of the pigmentation traits
max_n_snps = fst_clumped_df %>% 
  dplyr::filter(TRAIT %in% target_traits) %>% 
  dplyr::group_by(TRAIT) %>% 
  dplyr::summarise(SNP_COUNT = n_distinct(SNP)) %>% 
  dplyr::filter(TRAIT %in% c("skin pigmentation measurement", "skin pigmentation", "eye colour measurement")) %>% 
  dplyr::pull(SNP_COUNT) %>% 
  max(.)

# Get EFO IDs for target traits
target_efo_ids = fst_clumped_df %>% 
  dplyr::filter(TRAIT %in% target_traits) %>% 
  dplyr::distinct(EFO_ID) %>% 
  dplyr::pull(EFO_ID)

# Filter `clumped` for `target_traits` and then pull out the top `max_n_snps` for each trait
clumped_red = clumped[names(clumped) %in% target_efo_ids] %>% 
  dplyr::bind_rows(.id = "EFO_ID") %>% 
  dplyr::arrange(EFO_ID, P) %>% 
  dplyr::group_by(EFO_ID) %>%
  dplyr::distinct(SNP, .keep_all = T) %>% 
  dplyr::slice_head(n = max_n_snps) %>% 
  split(., f = .$EFO_ID)

# Filter `fst_all` for clumped SNPs
fst_clumped_red = fst_all[names(fst_all) %in% target_efo_ids]
clumped_red_names = names(fst_clumped_red)
fst_clumped_red = lapply(names(fst_clumped_red), function(TRAIT_ID){
    fst_all[[TRAIT_ID]] %>% 
      dplyr::filter(SNP %in% clumped_red[[TRAIT_ID]]$SNP)
})
names(fst_clumped_red) = clumped_red_names
# Bind into DF
fst_clumped_df_red = fst_clumped_red %>% 
  dplyr::bind_rows(.id = "EFO_ID") %>% 
  dplyr::left_join(all_traits_tbl %>% 
                     dplyr::select(EFO_ID = efo_id,
                                   TRAIT = trait),
                   by = "EFO_ID")

# Plot
plotly_ecdf_select_2 = fst_clumped_df_red %>% 
  # order FST_PEGAS so that it's properly plotted by plotly
  dplyr::arrange(EFO_ID, FST_PEGAS) %>% 
  # order
  dplyr::mutate(TRAIT = factor(TRAIT, levels = od_clumped_traits)) %>% 
  ggplot() +
    stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
    scale_colour_manual(values = pal_clump) +
    theme_bw() +
    ggtitle("eCDF") +
    guides(colour = "none") +
    theme(strip.text.x = element_text(size = 5))

plotly::ggplotly(plotly_ecdf_select_2,
                 tooltip = c("TRAIT"))

8 Were the HIV and viral load studies carried out in African populations?

# Get studies
hiv_traits = c("viral load", "HIV-1 infection")
hiv_efo_ids = all_traits_tbl %>% 
  dplyr::filter(trait %in% hiv_traits)
assocs_dir = "/nfs/research/birney/users/ian/hmn_fst/gwasrapidd/20210628/associations_raw"
hiv_studies = lapply(hiv_efo_ids$efo_id, function(HIV_EFO){
  out = list()
  # read in associations
  out[["ASSOCS"]] = readRDS(file.path(assocs_dir, paste(HIV_EFO, ".rds", sep = "")))
  out[["KEY"]] = gwasrapidd::association_to_study(unique(out[["ASSOCS"]]@associations$association_id))
  out[["STUDIES"]] = gwasrapidd::get_studies(study_id = unique(out[["KEY"]]$study_id))
  return(out)
})
names(hiv_studies) = hiv_efo_ids$trait

# Print studies
hiv_out = purrr::map(hiv_studies, function(HIV_TRAIT){
  HIV_TRAIT[["STUDIES"]]@studies %>% 
    dplyr::distinct(study_id,
                    initial_sample_size,
                    replication_sample_size) %>% 
    dplyr::left_join(HIV_TRAIT[["STUDIES"]]@publications %>% 
                       dplyr::select(study_id, title),
                     by = "study_id") %>% 
    dplyr::select(study_id, title, initial_sample_size, replication_sample_size)
})
 
# HIV-1 infection
DT::datatable(hiv_out$`HIV-1 infection`)
# Viral load
DT::datatable(hiv_out$`viral load`)

9 Final figures

9.1 Ridges

ridges_plot = fst_clumped_df %>% 
  # filter for `target_traits`
  dplyr::filter(TRAIT %in% target_traits) %>% 
  # group by trait to take unique SNPs
  dplyr::group_by(TRAIT) %>% 
  dplyr::distinct(SNP, .keep_all = T) %>% 
  dplyr::ungroup() %>% 
  # reverse order of traits to put `body height` at the top
  dplyr::mutate(TRAIT_REV = factor(TRAIT, levels = rev(names(pal_clump)))) %>% 
  # plot
  ggplot(aes(FST_PEGAS, TRAIT_REV, fill = TRAIT_REV, colour = TRAIT_REV)) +
    geom_density_ridges(scale = 2,
                        bandwidth = 0.003,
                        calc_ecdf = TRUE,
                        quantiles = c(0.5, 0.9),
                        quantile_lines = T,
                        jittered_points = T,
                        point_shape = '|', alpha = 0.85, point_size = 2,
                        position = position_points_jitter(height = 0),
                        ) +
    scale_fill_manual(values = pal_clump) +
    scale_colour_manual(values = darker(pal_clump)) +
    guides(fill = "none", colour = "none") +
    theme_cowplot() +
    scale_y_discrete(expand = expansion(add = c(0.2, 2.3))) +
    xlab(expression(italic(F[ST]))) +
    ylab(NULL)

ridges_plot
## Warning: Removed 1 rows containing non-finite values (stat_density_ridges).

width = 8
height = 20

ggsave(here::here("docs/plots/20210721_ridges.png"),
       device = "png",
       width = width,
       height = height,
       units = "in",
       dpi= 400)

ggsave(here::here("docs/plots/20210721_ridges.svg"),
       device = "svg",
       width = width,
       height = height,
       units = "in")

9.2 eCDF plot faceted

ecdf_plot_face = fst_clumped_df %>%
  # filter for `target_traits`
  dplyr::filter(TRAIT %in% target_traits) %>%
  # take unique SNPs
  dplyr::group_by(EFO_ID) %>%
  dplyr::distinct(SNP, .keep_all = T) %>% 
  dplyr::ungroup() %>% 
  # order FST_PEGAS so that it's properly plotted by plotly
  dplyr::arrange(EFO_ID, FST_PEGAS) %>% 
  # order
  dplyr::mutate(TRAIT = factor(TRAIT, levels = od_clumped_traits)) %>% 
  ggplot() +
    stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
    scale_colour_manual(values = pal_clump) +
    theme_cowplot(rel_small = 9/14) +
    guides(colour = "none") +
    theme(strip.text.x = element_text(size = 5)) +
    xlab(expression(italic(F[ST]))) +
    ylab("Cumulative Probability") +
    facet_wrap(~TRAIT, nrow = 7, ncol = 4) 

ecdf_plot_face
## Warning: Removed 1 rows containing non-finite values (stat_ecdf).

ecdf_plot_all = fst_clumped_df %>%
  # filter for `target_traits`
  dplyr::filter(TRAIT %in% target_traits) %>%
  # take unique SNPs
  dplyr::group_by(EFO_ID) %>%
  dplyr::distinct(SNP, .keep_all = T) %>% 
  dplyr::ungroup() %>% 
  # order FST_PEGAS so that it's properly plotted by plotly
  dplyr::arrange(EFO_ID, FST_PEGAS) %>% 
  # order
  dplyr::mutate(TRAIT = factor(TRAIT, levels = names(pal_clump)[names(pal_clump) %in% target_traits])) %>% 
  ggplot() +
    stat_ecdf(aes(FST_PEGAS, colour = TRAIT)) +
    scale_colour_manual(values = pal_clump) +
    theme_cowplot() +
    guides(colour = "none") +
    theme(strip.text.x = element_text(size = 5)) +
    xlab(expression(italic(F[ST]))) +
    ylab("Cumulative Probability")

ecdf_plot_all
## Warning: Removed 1 rows containing non-finite values (stat_ecdf).

9.3 Compile

final_figure = cowplot::ggdraw() +
  draw_plot(ridges_plot,
            x = 0, y = 0, width = .5, height = 1) +
  draw_plot(ecdf_plot_face,
            x = .5, y = 0.35, width = .5, height = .65) +
  draw_plot(ecdf_plot_all,
            x = .5, y = 0, width = .5, height = .35) +
  draw_plot_label(label = c("A", "B", "C"), size = 25,
                  x = c(0, .5, .5), y = c(1, 1, .35),
                  hjust = c(-.15, .15, .15),
                  color = "#37323E")
## Warning: Removed 1 rows containing non-finite values (stat_density_ridges).
## Warning: Removed 1 rows containing non-finite values (stat_ecdf).

## Warning: Removed 1 rows containing non-finite values (stat_ecdf).
final_figure

width = 12
height = 12

ggsave(here::here("docs/plots/20210721_final.png"),
       device = "png",
       width = width,
       height = height,
       units = "in",
       dpi= 400)

ggsave(here::here("docs/plots/20210721_final.svg"),
       device = "svg",
       width = width,
       height = height,
       units = "in")